# Third-party imports, grouped and de-duplicated (pandas was imported twice
# in the original). No name is removed — later cells rely on pd, np, sns, px,
# plt, sqldf, KMeans, dendrogram/linkage, LabelEncoder and StandardScaler.
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
from pandasql import sqldf
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split  # NOTE(review): unused in the visible cells
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Load the raw bank-transactions dataset (~1.05M rows, 9 columns).
# NOTE(review): relative path with doubled slashes ("//") resolves on most OSes,
# but confirm the file location; pathlib would be more robust.
df = pd.read_csv("Documents//Python//bank_transactions.csv")
df.head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount (INR) | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | T2 | C2142763 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | T3 | C4417068 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | T4 | C5342380 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | T5 | C9031234 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
# Drop the "(INR)" unit suffix so the amount column is easier to reference
# in code and in SQL queries later on.
amount_rename = {'TransactionAmount (INR)': 'TransactionAmount'}
df.rename(columns=amount_rename, inplace=True)
df.head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | T2 | C2142763 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | T3 | C4417068 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | T4 | C5342380 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | T5 | C9031234 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
# Summary statistics of the numeric columns; the max values (balance ~1.15e8,
# amount ~1.56e6) are far above the 75th percentiles, hinting at heavy outliers.
df.describe()
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| count | 1.046198e+06 | 1.048567e+06 | 1.048567e+06 |
| mean | 1.154035e+05 | 1.570875e+05 | 1.574335e+03 |
| std | 8.464854e+05 | 5.126185e+04 | 6.574743e+03 |
| min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 4.721760e+03 | 1.240300e+05 | 1.610000e+02 |
| 50% | 1.679218e+04 | 1.642260e+05 | 4.590300e+02 |
| 75% | 5.765736e+04 | 2.000100e+05 | 1.200000e+03 |
| max | 1.150355e+08 | 2.359590e+05 | 1.560035e+06 |
# Column dtypes and non-null counts; dates and DOB arrive as plain strings.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1048567 entries, 0 to 1048566 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TransactionID 1048567 non-null object 1 CustomerID 1048567 non-null object 2 CustomerDOB 1045170 non-null object 3 CustGender 1047467 non-null object 4 CustLocation 1048416 non-null object 5 CustAccountBalance 1046198 non-null float64 6 TransactionDate 1048567 non-null object 7 TransactionTime 1048567 non-null int64 8 TransactionAmount 1048567 non-null float64 dtypes: float64(2), int64(1), object(6) memory usage: 72.0+ MB
# Absolute count of missing values per column.
df.isnull().sum()
TransactionID 0 CustomerID 0 CustomerDOB 3397 CustGender 1100 CustLocation 151 CustAccountBalance 2369 TransactionDate 0 TransactionTime 0 TransactionAmount 0 dtype: int64
# Share of missing values per column, in percent (all well under 0.5%).
100*(df.isnull().sum()/len(df))
TransactionID 0.000000 CustomerID 0.000000 CustomerDOB 0.323966 CustGender 0.104905 CustLocation 0.014401 CustAccountBalance 0.225927 TransactionDate 0.000000 TransactionTime 0.000000 TransactionAmount 0.000000 dtype: float64
# Inspect rows with missing gender — many of them also carry the placeholder
# DOB '1/1/1800', suggesting these records were entered with dummy values.
df[df['CustGender'].isnull()].head(10)
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|---|---|
| 176 | T177 | C5698953 | 1/1/1800 | NaN | NAVI MUMBAI | 8512.28 | 12/8/16 | 155727 | 3449.00 |
| 2124 | T2125 | C4525944 | 1/1/1800 | NaN | BANGALORE | 7115.05 | 21/10/16 | 201052 | 584.06 |
| 5061 | T5062 | C7541851 | 1/1/1800 | NaN | MUMBAI | 10669.85 | 22/9/16 | 61227 | 1500.00 |
| 13035 | T13036 | C3788162 | 1/1/1800 | NaN | MULUND WEST | 65989.63 | 26/9/16 | 210819 | 336.52 |
| 14878 | T14879 | C6239153 | 1/1/1800 | NaN | NAVI MUMBAI | 603.00 | 26/9/16 | 163122 | 1494.00 |
| 15314 | T15315 | C5793145 | 12/12/50 | NaN | MUMBAI | 597.00 | 26/9/16 | 92928 | 1.00 |
| 15768 | T15769 | C8891925 | 1/1/1800 | NaN | NAVI MUMBAI | 31691.58 | 26/9/16 | 205542 | 4448.64 |
| 16788 | T16789 | C3893120 | 12/12/50 | NaN | MUMBAI | 597.00 | 26/9/16 | 93137 | 1.00 |
| 17300 | T17301 | C7093134 | 12/12/50 | NaN | MUMBAI | 597.00 | 26/9/16 | 93033 | 1.00 |
| 17942 | T17943 | C5239087 | 1/1/1800 | NaN | MUMBAI | 97178.78 | 26/9/16 | 155706 | 3310.00 |
# Inspect rows with a missing account balance; otherwise they look ordinary.
df[df['CustAccountBalance'].isnull()].head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|---|---|
| 29 | T30 | C8736958 | 19/12/86 | M | GRAM PANCHAYTH HYDERABAD | NaN | 3/8/16 | 160636 | 2600.0 |
| 1271 | T1272 | C2314972 | 29/7/93 | M | THANE | NaN | 21/10/16 | 203227 | 53.0 |
| 2513 | T2514 | C3742855 | 18/3/87 | F | MUMBAI | NaN | 21/10/16 | 150602 | 300.0 |
| 2831 | T2832 | C3221342 | 27/2/02 | M | NEW DELHI | NaN | 21/10/16 | 54746 | 1.0 |
| 3550 | T3551 | C5921379 | 27/2/02 | M | NEW DELHI | NaN | 21/10/16 | 54704 | 1.0 |
# Drop every row with any missing value: ~6.9k of ~1.05M rows (<0.7%), so the
# loss is negligible for this analysis. The .copy() detaches the result from
# `df`, so later in-place edits on newdf no longer raise SettingWithCopyWarning
# (the original dropna result was flagged as a potential slice of df).
newdf = df.dropna(axis=0).copy()
newdf.isnull().sum()
TransactionID 0 CustomerID 0 CustomerDOB 0 CustGender 0 CustLocation 0 CustAccountBalance 0 TransactionDate 0 TransactionTime 0 TransactionAmount 0 dtype: int64
# Compare row counts before/after the dropna.
print(newdf.shape,df.shape)
(1041614, 9) (1048567, 9)
# Net number of rows removed (negative = rows dropped).
print(len(newdf)-len(df))
-6953
# TransactionID and CustomerID are unique identifiers with no value for
# clustering, so drop them. Reassigning (instead of inplace=True on what pandas
# considers a possible slice of df) avoids the SettingWithCopyWarning.
newdf = newdf.drop(columns=['TransactionID', 'CustomerID'])
/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
newdf.head()
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|
| 0 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
newdf.corr()  # pairwise Pearson correlation of the numeric columns; no strong linear relationship anywhere (all |r| < 0.07)
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| CustAccountBalance | 1.000000 | -0.004012 | 0.062692 |
| TransactionTime | -0.004012 | 1.000000 | 0.008296 |
| TransactionAmount | 0.062692 | 0.008296 | 1.000000 |
# Covariance matrix of the numeric columns (scale-dependent, shown for reference).
newdf.cov()
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| CustAccountBalance | 7.170041e+11 | -1.741558e+08 | 3.483155e+08 |
| TransactionTime | -1.741558e+08 | 2.627948e+09 | 2.790504e+06 |
| TransactionAmount | 3.483155e+08 | 2.790504e+06 | 4.305281e+07 |
newdf.CustomerDOB.value_counts()  # '1/1/1800' appears 56k+ times — clearly a placeholder/dummy DOB, not a real birth date
1/1/1800 56292
1/1/89 809
1/1/90 784
6/8/91 698
1/1/91 665
...
2/12/51 1
20/3/52 1
26/9/47 1
4/10/41 1
24/10/44 1
Name: CustomerDOB, Length: 17233, dtype: int64
# Discard the rows carrying the '1/1/1800' placeholder DOB — almost certainly
# a data-entry default rather than a genuine birth date.
placeholder_dob = newdf['CustomerDOB'] == '1/1/1800'
newdf = newdf.loc[~placeholder_dob]
newdf.CustomerDOB.value_counts()
1/1/89 809
1/1/90 784
6/8/91 698
1/1/91 665
1/1/92 631
...
23/2/05 1
28/11/42 1
23/9/49 1
14/3/40 1
24/10/44 1
Name: CustomerDOB, Length: 17232, dtype: int64
# Parse both date columns from their day/month/two-digit-year text form.
for date_col in ('TransactionDate', 'CustomerDOB'):
    newdf[date_col] = pd.to_datetime(newdf[date_col], format='%d/%m/%y')
newdf.CustomerDOB  # %y maps two-digit years 00-68 to 20xx, so e.g. '57' parsed as 2057 — impossible DOBs, fixed below
0 1994-01-10
1 2057-04-04
2 1996-11-26
3 1973-09-14
4 1988-03-24
...
1048562 1990-04-08
1048563 1992-02-20
1048564 1989-05-18
1048565 1978-08-30
1048566 1984-03-05
Name: CustomerDOB, Length: 985322, dtype: datetime64[ns]
# Any parsed birth year of 2021 or later is a mis-pivoted two-digit year;
# shift those dates back one century.
future_dob = newdf['CustomerDOB'].dt.year >= 2021
newdf.loc[future_dob, ['CustomerDOB']] -= pd.DateOffset(years=100)
newdf.head()  # the wrongly-converted records now sit in the 1900s
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|
| 0 | 1994-01-10 | F | JAMSHEDPUR | 17819.05 | 2016-08-02 | 143207 | 25.0 |
| 1 | 1957-04-04 | M | JHAJJAR | 2270.69 | 2016-08-02 | 141858 | 27999.0 |
| 2 | 1996-11-26 | F | MUMBAI | 17874.44 | 2016-08-02 | 142712 | 459.0 |
| 3 | 1973-09-14 | F | MUMBAI | 866503.21 | 2016-08-02 | 142714 | 2060.0 |
| 4 | 1988-03-24 | F | NAVI MUMBAI | 6714.43 | 2016-08-02 | 181156 | 1762.5 |
# Age in whole years as of "today" (not as of the transaction date — the
# ages are therefore relative to when the notebook is run).
age_years = (pd.to_datetime('today') - newdf['CustomerDOB']) / np.timedelta64(1, 'Y')
newdf['Age'] = age_years.round()
newdf.shape
(985322, 8)
# Confirm the new Age column.
newdf.head()
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | Age | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1994-01-10 | F | JAMSHEDPUR | 17819.05 | 2016-08-02 | 143207 | 25.0 | 28.0 |
| 1 | 1957-04-04 | M | JHAJJAR | 2270.69 | 2016-08-02 | 141858 | 27999.0 | 65.0 |
| 2 | 1996-11-26 | F | MUMBAI | 17874.44 | 2016-08-02 | 142712 | 459.0 | 25.0 |
| 3 | 1973-09-14 | F | MUMBAI | 866503.21 | 2016-08-02 | 142714 | 2060.0 | 48.0 |
| 4 | 1988-03-24 | F | NAVI MUMBAI | 6714.43 | 2016-08-02 | 181156 | 1762.5 | 34.0 |
newdf['TransactionAmount'].describe()  # median amount is ~431 INR; the query below keeps transactions above 500, i.e. roughly the upper half
count 9.853220e+05 mean 1.452425e+03 std 6.139765e+03 min 0.000000e+00 25% 1.510000e+02 50% 4.310000e+02 75% 1.125000e+03 max 1.560035e+06 Name: TransactionAmount, dtype: float64
# One representative row per customer location, restricted to transactions
# above 500 INR. Using max(TransactionAmount) makes the query deterministic:
# with a bare-column GROUP BY, SQLite would pick an arbitrary row per group;
# with max(), the other (bare) columns are taken from the row holding the
# maximum amount (documented SQLite behaviour), which also matches the
# descending ORDER BY intent.
Loc_TA = sqldf("""
    select Age, CustAccountBalance, CustGender, CustLocation,
           max(TransactionAmount) as TransactionAmount
    from newdf
    where TransactionAmount > 500
    group by CustLocation
    order by TransactionAmount desc
""")
Loc_TA
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 42.0 | 798753.95 | M | GATE NO 4 MUMBAI | 455122.00 |
| 1 | 35.0 | 90123.52 | M | JAIPUR DURGAPURA | 378415.46 |
| 2 | 45.0 | 215315.01 | M | ROOMFORD | 117621.00 |
| 3 | 44.0 | 43113.77 | M | MILTON KEYNES | 104478.00 |
| 4 | 38.0 | 73542.33 | M | PHASE-01 HINJEWADI PUNE | 100000.00 |
| ... | ... | ... | ... | ... | ... |
| 6662 | 38.0 | 8387.04 | M | NAGAR CHENNAI | 501.27 |
| 6663 | 31.0 | 5075.39 | M | THALAYOLAPARAMBU | 501.00 |
| 6664 | 30.0 | 76438.86 | F | PETHAPUR GANDHI NAGAR | 501.00 |
| 6665 | 70.0 | 90260.19 | M | DAHISAR E MUMBAI | 501.00 |
| 6666 | 32.0 | 494674.28 | F | BORIVALI E | 501.00 |
6667 rows × 5 columns
# Median of the per-location transaction amounts (after the >500 filter).
Loc_TA['TransactionAmount'].median()
1298.0
# One row per location: 6667 locations, 5 features.
Loc_TA.shape
(6667, 5)
# Scatter of age vs transaction amount, coloured by gender.
scatter_kwargs = dict(x='Age', y='TransactionAmount', color='CustGender')
fig = px.scatter(Loc_TA, **scatter_kwargs)
fig.update_layout(title_text="Age vs. Transaction amount", title_x=0.5)
fig.show()
# Re-plot with extreme amounts excluded so the bulk of the data is visible.
amount_cap = 50000
Loc_TA_filter = Loc_TA[Loc_TA['TransactionAmount'] < amount_cap]
fig = px.scatter(Loc_TA_filter, x='Age', y='TransactionAmount', color='CustGender')
fig.update_layout(title_text="Age vs. Transaction amount", title_x=0.5)
fig.show()
# Encode the two categorical columns as integers so they can be scaled and
# fed to K-Means. NOTE(review): label encoding imposes an arbitrary ordering
# on locations; one-hot or target encoding may be more appropriate — confirm.
labelencoder = LabelEncoder()
for cat_col in ('CustGender', 'CustLocation'):
    Loc_TA[cat_col] = labelencoder.fit_transform(Loc_TA[cat_col])
Loc_TA.head()
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 42.0 | 798753.95 | 1 | 2060 | 455122.00 |
| 1 | 35.0 | 90123.52 | 1 | 2542 | 378415.46 |
| 2 | 45.0 | 215315.01 | 1 | 5140 | 117621.00 |
| 3 | 44.0 | 43113.77 | 1 | 3700 | 104478.00 |
| 4 | 38.0 | 73542.33 | 1 | 4673 | 100000.00 |
# Standardise every feature to zero mean / unit variance so K-Means distances
# are not dominated by the large-magnitude columns (balance, amount).
scaler = StandardScaler()
Loc_TA1= scaler.fit_transform(Loc_TA)  # returns a plain ndarray
Loc_TA1
array([[ 5.27984731e-01, 1.25170860e+00, 5.91727773e-01,
-6.61437139e-01, 4.86905810e+01],
[-2.57470607e-01, -2.06569146e-02, 5.91727773e-01,
-4.10995112e-01, 4.04302057e+01],
[ 8.64608447e-01, 2.04127864e-01, 5.91727773e-01,
9.38897808e-01, 1.23457670e+01],
...,
[-8.18510134e-01, -4.52280996e-02, -1.68996631e+00,
6.84299067e-01, -2.66653699e-01],
[ 3.66980608e+00, -2.04115198e-02, 5.91727773e-01,
-9.90337146e-01, -2.66653699e-01],
[-5.94094323e-01, 7.05725152e-01, -1.68996631e+00,
-1.25324932e+00, -2.66653699e-01]])
# Restore DataFrame form (with the original column names) after scaling.
Loc_TA1 = pd.DataFrame(Loc_TA1, columns=Loc_TA.columns)
Loc_TA1
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 0.527985 | 1.251709 | 0.591728 | -0.661437 | 48.690581 |
| 1 | -0.257471 | -0.020657 | 0.591728 | -0.410995 | 40.430206 |
| 2 | 0.864608 | 0.204128 | 0.591728 | 0.938898 | 12.345767 |
| 3 | 0.752401 | -0.105064 | 0.591728 | 0.190689 | 10.930423 |
| 4 | 0.079153 | -0.050429 | 0.591728 | 0.696250 | 10.448196 |
| ... | ... | ... | ... | ... | ... |
| 6662 | 0.079153 | -0.167417 | 0.591728 | 0.316949 | -0.266625 |
| 6663 | -0.706302 | -0.173363 | 0.591728 | 1.367039 | -0.266654 |
| 6664 | -0.818510 | -0.045228 | -1.689966 | 0.684299 | -0.266654 |
| 6665 | 3.669806 | -0.020412 | 0.591728 | -0.990337 | -0.266654 |
| 6666 | -0.594094 | 0.705725 | -1.689966 | -1.253249 | -0.266654 |
6667 rows × 5 columns
# Elbow method: run K-Means for k = 1..10 and record the inertia
# (within-cluster sum of squares) at each k; the "elbow" in the curve
# suggests the number of clusters to use.
wcss = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', max_iter=500, n_init=10, random_state=0)
    km.fit(Loc_TA1)
    wcss.append(km.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # within-cluster sum of squares
plt.show()
# Fit the final model with k = 6 (chosen from the elbow plot) and label every
# row. random_state pins the centroid initialisation so the cluster labels are
# reproducible across runs (the original call was nondeterministic), matching
# the seed already used in the elbow loop.
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(Loc_TA1)
y_kmeans = kmeans.predict(Loc_TA1)
y_kmeans
# Attach the cluster label (plus a duplicate of the encoded gender column,
# kept under the name 'namex') on a copy, leaving Loc_TA1 untouched.
df3 = Loc_TA1.copy()
df3['clusterNo'] = y_kmeans
df3['namex'] = df3['CustGender']
print(df3.head())
Age CustAccountBalance CustGender CustLocation TransactionAmount \ 0 0.527985 1.251709 0.591728 -0.661437 48.690581 1 -0.257471 -0.020657 0.591728 -0.410995 40.430206 2 0.864608 0.204128 0.591728 0.938898 12.345767 3 0.752401 -0.105064 0.591728 0.190689 10.930423 4 0.079153 -0.050429 0.591728 0.696250 10.448196 clusterNo namex 0 2 0.591728 1 2 0.591728 2 5 0.591728 3 5 0.591728 4 5 0.591728
# Verify Loc_TA1 itself was not mutated by the assignment above.
Loc_TA1.head()
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 0.527985 | 1.251709 | 0.591728 | -0.661437 | 48.690581 |
| 1 | -0.257471 | -0.020657 | 0.591728 | -0.410995 | 40.430206 |
| 2 | 0.864608 | 0.204128 | 0.591728 | 0.938898 | 12.345767 |
| 3 | 0.752401 | -0.105064 | 0.591728 | 0.190689 | 10.930423 |
| 4 | 0.079153 | -0.050429 | 0.591728 | 0.696250 | 10.448196 |
# Alternative technique: hierarchical (agglomerative) clustering with average
# linkage, visualised as a dendrogram over all rows.
linked = linkage(Loc_TA1, 'average')
plt.figure(figsize=(30, 30))
zx = dendrogram(
    linked,
    orientation='top',
    labels=Loc_TA1.index,
    color_threshold=60,
    above_threshold_color='red',
)
plt.show()
# Same dendrogram, truncated to the last 100 merges for readability.
plt.figure(figsize=(30, 30))
zx = dendrogram(
    linked,
    truncate_mode='lastp',
    p=100,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    orientation='top',
    labels=Loc_TA1.index,
    color_threshold=60,
    above_threshold_color='red',
)
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.show()